Aquifer

Petrignano

Data preparation

df <- as.data.frame(read.table('./acea-water-prediction/Aquifer_Petrignano.csv', header = TRUE, sep=','))
head(df, n=20)
##       п.їDate Rainfall_Bastia_Umbra Depth_to_Groundwater_P24
## 1  14/03/2006                    NA                   -22.48
## 2  15/03/2006                    NA                   -22.38
## 3  16/03/2006                    NA                   -22.25
## 4  17/03/2006                    NA                   -22.38
## 5  18/03/2006                    NA                   -22.60
## 6  19/03/2006                    NA                   -22.35
## 7  20/03/2006                    NA                   -22.50
## 8  21/03/2006                    NA                   -22.31
## 9  22/03/2006                    NA                   -22.31
## 10 23/03/2006                    NA                   -22.40
## 11 24/03/2006                    NA                   -22.32
## 12 25/03/2006                    NA                   -22.25
## 13 26/03/2006                    NA                   -22.15
## 14 27/03/2006                    NA                   -22.47
## 15 28/03/2006                    NA                   -22.27
## 16 29/03/2006                    NA                   -22.52
## 17 30/03/2006                    NA                   -22.50
## 18 31/03/2006                    NA                   -22.70
## 19 01/04/2006                    NA                   -22.77
## 20 02/04/2006                    NA                   -22.49
##    Depth_to_Groundwater_P25 Temperature_Bastia_Umbra Temperature_Petrignano
## 1                    -22.18                       NA                     NA
## 2                    -22.14                       NA                     NA
## 3                    -22.04                       NA                     NA
## 4                    -22.04                       NA                     NA
## 5                    -22.04                       NA                     NA
## 6                    -21.95                       NA                     NA
## 7                    -21.99                       NA                     NA
## 8                    -21.89                       NA                     NA
## 9                    -21.82                       NA                     NA
## 10                   -21.89                       NA                     NA
## 11                   -21.89                       NA                     NA
## 12                   -21.82                       NA                     NA
## 13                   -21.79                       NA                     NA
## 14                   -21.83                       NA                     NA
## 15                   -21.75                       NA                     NA
## 16                   -21.81                       NA                     NA
## 17                   -21.84                       NA                     NA
## 18                   -21.87                       NA                     NA
## 19                   -21.90                       NA                     NA
## 20                   -21.83                       NA                     NA
##    Volume_C10_Petrignano Hydrometry_Fiume_Chiascio_Petrignano
## 1                     NA                                   NA
## 2                     NA                                   NA
## 3                     NA                                   NA
## 4                     NA                                   NA
## 5                     NA                                   NA
## 6                     NA                                   NA
## 7                     NA                                   NA
## 8                     NA                                   NA
## 9                     NA                                   NA
## 10                    NA                                   NA
## 11                    NA                                   NA
## 12                    NA                                   NA
## 13                    NA                                   NA
## 14                    NA                                   NA
## 15                    NA                                   NA
## 16                    NA                                   NA
## 17                    NA                                   NA
## 18                    NA                                   NA
## 19                    NA                                   NA
## 20                    NA                                   NA

Remove NA elements from data

df <- df[complete.cases(df$Rainfall_Bastia_Umbra),]
rownames(df) <- 1:nrow(df)
df <- subset(df, select = -c(Depth_to_Groundwater_P25, Temperature_Petrignano ))
colnames(df) <- c("Date", "Rainfall", "Depth_to_Groundwater","Temperature", " Volume", " Hydrometry" )
head(df, n=20)
##          Date Rainfall Depth_to_Groundwater Temperature    Volume  Hydrometry
## 1  01/01/2009      0.0               -31.96         5.2 -24530.69         2.4
## 2  02/01/2009      0.0               -32.03         2.3 -28785.89         2.5
## 3  03/01/2009      0.0               -31.97         4.4 -25766.21         2.4
## 4  04/01/2009      0.0               -31.91         0.8 -27919.30         2.4
## 5  05/01/2009      0.0               -31.94        -1.9 -29854.66         2.3
## 6  06/01/2009      0.0               -31.89        -0.7 -29124.58         2.3
## 7  07/01/2009      0.0               -31.91         1.5 -31173.12         2.3
## 8  08/01/2009      0.0               -31.83         4.3 -30232.22         2.4
## 9  09/01/2009      0.9               -31.80         4.9 -30597.70         2.3
## 10 10/01/2009      0.0               -31.76         1.9 -31337.28         2.3
## 11 11/01/2009      0.0               -31.70         3.4 -29845.15         2.3
## 12 12/01/2009      0.0               -31.57         3.3 -28745.28         2.3
## 13 13/01/2009      1.1               -31.54         6.0 -28932.77         2.3
## 14 14/01/2009      0.0               -31.44         8.2 -28600.13         2.3
## 15 15/01/2009      0.0               -31.26         7.8 -24973.06         2.3
## 16 16/01/2009      0.0               -31.50         5.1 -31388.26         2.3
## 17 17/01/2009      0.0               -31.61         1.6 -30941.57         2.3
## 18 18/01/2009      0.1               -31.15         6.4 -23043.74         2.3
## 19 19/01/2009      0.1               -30.95        10.3 -21658.75         2.3
## 20 20/01/2009      0.0               -30.93        12.1 -23793.70         2.3

Plot data

library('ggplot2')
library('forecast')
library('zoo')
library('dplyr')
library('data.table')
library('imputeTS')
library('xts')
library('tseries')
library('stats')
library('nlme')
library('fpp')
library('lubridate')
library('TSstudio')
library('AICcmodavg')

Data preparation

# sort by date
df$Date <- as.Date(df$Date, format= "%d/%m/%Y")
df <- df[order(df$Date), ]
interval = df$Date - shift(df$Date, n=1, fill=NA, type="lag")
for(i in interval)
{
  
  if(isTRUE(i > 1) || is.null(i))
  {
    print(i)
  }
    
}

As we can see, there are some missing values that we need to fix. We can see it from stat and from graphics where red places or outliers.

# find that there are some missing values in data set
ggplot_na_distribution(df$Depth_to_Groundwater)

ggplot_na_distribution(df$Rainfall)

ggplot_na_distribution(df$Temperature)

ggplot_na_distribution(df$` Volume`)

ggplot_na_distribution(df$` Hydrometry`)

statsNA(df$Depth_to_Groundwater)
## [1] "Length of time series:"
## [1] 4199
## [1] "-------------------------"
## [1] "Number of Missing Values:"
## [1] 39
## [1] "-------------------------"
## [1] "Percentage of Missing Values:"
## [1] "0.929%"
## [1] "-------------------------"
## [1] "Number of Gaps:"
## [1] 4
## [1] "-------------------------"
## [1] "Average Gap Size:"
## [1] 9.75
## [1] "-------------------------"
## [1] "Stats for Bins"
## [1] "  Bin 1 (1050 values from 1 to 1050) :      0 NAs (0%)"
## [1] "  Bin 2 (1050 values from 1051 to 2100) :      28 NAs (2.67%)"
## [1] "  Bin 3 (1050 values from 2101 to 3150) :      0 NAs (0%)"
## [1] "  Bin 4 (1049 values from 3151 to 4199) :      11 NAs (1.05%)"
## [1] "-------------------------"
## [1] "Longest NA gap (series of consecutive NAs)"
## [1] "21 in a row"
## [1] "-------------------------"
## [1] "Most frequent gap size (series of consecutive NA series)"
## [1] "7 NA in a row (occuring 2 times)"
## [1] "-------------------------"
## [1] "Gap size accounting for most NAs"
## [1] "21 NA in a row (occuring 1 times, making up for overall 21 NAs)"
## [1] "-------------------------"
## [1] "Overview NA series"
## [1] "  4 NA in a row: 1 times"
## [1] "  7 NA in a row: 2 times"
## [1] "  21 NA in a row: 1 times"
statsNA(df$Rainfall)
## [1] "Length of time series:"
## [1] 4199
## [1] "-------------------------"
## [1] "Number of Missing Values:"
## [1] 0
## [1] "-------------------------"
## [1] "Percentage of Missing Values:"
## [1] "0%"
## [1] "-------------------------"
## [1] "Number of Gaps:"
## [1] 0
## [1] "-------------------------"
## [1] "Average Gap Size:"
## [1] 0
## [1] "-------------------------"
## [1] "No NAs in the time series."
## [1] "-------------------------"
## [1] "There are no NAs in the time series"
statsNA(df$Temperature)
## [1] "Length of time series:"
## [1] 4199
## [1] "-------------------------"
## [1] "Number of Missing Values:"
## [1] 0
## [1] "-------------------------"
## [1] "Percentage of Missing Values:"
## [1] "0%"
## [1] "-------------------------"
## [1] "Number of Gaps:"
## [1] 0
## [1] "-------------------------"
## [1] "Average Gap Size:"
## [1] 0
## [1] "-------------------------"
## [1] "No NAs in the time series."
## [1] "-------------------------"
## [1] "There are no NAs in the time series"
statsNA(df$` Volume`)
## [1] "Length of time series:"
## [1] 4199
## [1] "-------------------------"
## [1] "Number of Missing Values:"
## [1] 1
## [1] "-------------------------"
## [1] "Percentage of Missing Values:"
## [1] "0.0238%"
## [1] "-------------------------"
## [1] "Number of Gaps:"
## [1] 1
## [1] "-------------------------"
## [1] "Average Gap Size:"
## [1] 1
## [1] "-------------------------"
## [1] "Stats for Bins"
## [1] "  Bin 1 (1050 values from 1 to 1050) :      0 NAs (0%)"
## [1] "  Bin 2 (1050 values from 1051 to 2100) :      0 NAs (0%)"
## [1] "  Bin 3 (1050 values from 2101 to 3150) :      0 NAs (0%)"
## [1] "  Bin 4 (1049 values from 3151 to 4199) :      1 NAs (0.0953%)"
## [1] "-------------------------"
## [1] "Longest NA gap (series of consecutive NAs)"
## [1] "1 in a row"
## [1] "-------------------------"
## [1] "Most frequent gap size (series of consecutive NA series)"
## [1] "1 NA in a row (occuring 1 times)"
## [1] "-------------------------"
## [1] "Gap size accounting for most NAs"
## [1] "1 NA in a row (occuring 1 times, making up for overall 1 NAs)"
## [1] "-------------------------"
## [1] "Overview NA series"
## [1] "  1 NA in a row: 1 times"
statsNA(df$` Hydrometry`)
## [1] "Length of time series:"
## [1] 4199
## [1] "-------------------------"
## [1] "Number of Missing Values:"
## [1] 0
## [1] "-------------------------"
## [1] "Percentage of Missing Values:"
## [1] "0%"
## [1] "-------------------------"
## [1] "Number of Gaps:"
## [1] 0
## [1] "-------------------------"
## [1] "Average Gap Size:"
## [1] 0
## [1] "-------------------------"
## [1] "No NAs in the time series."
## [1] "-------------------------"
## [1] "There are no NAs in the time series"

So we need to fix data

Interpolate data to fix missing values

times_dd <- ts(df$Depth_to_Groundwater, start = df$Date[1], frequency=1)
ggplot_na_distribution(times_dd)

times_dd <- na_interpolation(times_dd, option="spline")

After interpolation

ggplot_na_distribution(times_dd)

plot( times_dd,xlab = "Time", ylab = "Depth_to_Groundwater", main="Depth", axes=FALSE, type="l")
axis(1,at=seq(df$Date[1], df$Date[length(df$Date)], by="years"),labels=unique(as.character(df$Date, format= "%Y")))
axis(2)
box()

Volume has some troubles

df$` Volume` <- ifelse(df$` Volume` == 0, NaN, df$` Volume`)
times_vv <- ts(df$` Volume`, start = df$Date[1], frequency=1)
ggplot_na_distribution(times_vv)

times_vv <- na_interpolation(times_vv, option="linear")
ggplot_na_distribution(times_vv)

plot( times_vv,xlab = "Time", ylab = "Volume", main="Volume", axes=FALSE, type="l")
axis(1,at=seq(df$Date[1], df$Date[length(df$Date)], by="years"),labels=unique(as.character(df$Date, format= "%Y")))
axis(2)
box()

Interpolation can’t be realised due to there are no missing values, there are only 0 values, that are anomaly for for current data. So we can change 0 to nan and then interpolate

df$` Hydrometry` <- ifelse(df$` Hydrometry` == 0, NaN, df$` Hydrometry`)
times_hh <- ts(df$` Hydrometry`, start = df$Date[1], frequency=1)
ggplot_na_distribution(times_hh)

times_hh <- na_interpolation(times_hh, option="linear")
ggplot_na_distribution(times_hh)

plot( times_hh,xlab = "Time", ylab = "Hydrometry", main="Hydrometry", axes=FALSE, type="l")
axis(1,at=seq(df$Date[1], df$Date[length(df$Date)], by="years"),labels=unique(as.character(df$Date, format= "%Y")))
axis(2)
box()

Resampling

the series can be assembled unevenly over time and to eliminate this resampling is used So if data was collected once an hour, and then suddenly once a day, and then once every 3 hours we will make it at equal intervals. But there we haven’t got any troubles that was checd by counting interval higher

try to unsderstand in what way and frequency it will be more efficient to predict

times_dd <- ts(df$Depth_to_Groundwater, start = df$Date[1], frequency=1)
times_dd <- na_interpolation(times_dd, option="spline")
time_dd_week <- period.apply(as.xts(times_dd), endpoints(as.xts(times_dd), "weeks"), range)
times_dd <- na_interpolation(time_dd_week, option="spline")
plot( time_dd_week,xlab = "Time", ylab = "Depth_to_Groundwater", main="Depth", axes=FALSE, type="l")
axis(1,at=seq(df$Date[1], df$Date[length(df$Date)], by="years"),labels=unique(as.character(df$Date, format= "%Y")))
axis(2)
box()

times_t <- ts(df$Temperature, start = df$Date[1], frequency=1)
times_tt_week <- period.apply(as.xts(times_t), endpoints(as.xts(times_t), "weeks"), range)
plot( times_tt_week,xlab = "Time", ylab = "Temperature", main="Temperature", axes=FALSE, type="l")
axis(1,at=seq(df$Date[1], df$Date[length(df$Date)], by="years"),labels=unique(as.character(df$Date, format= "%Y")))
axis(2)
box()

times_vv <- ts(df$` Volume`, start = df$Date[1], frequency=1)
times_vv_week <- period.apply(as.xts(times_vv), endpoints(as.xts(times_vv), "weeks"), range)
times_vv_week <- na_interpolation(times_vv_week, option="linear")
plot( times_vv_week,xlab = "Time", ylab = "Volume", main="Volume", axes=FALSE, type="l")
axis(1,at=seq(df$Date[1], df$Date[length(df$Date)], by="years"),labels=unique(as.character(df$Date, format= "%Y")))
axis(2)
box()

times_hh <- ts(df$` Hydrometry`, start = df$Date[1], frequency=1)
times_hh_week <- period.apply(as.xts(times_hh), endpoints(as.xts(times_hh), "weeks"), range)
times_hh_week <- na_interpolation(times_hh_week, option="linear")
plot( times_hh_week,xlab = "Time", ylab = "Hydrometry", main="Hydrometry", axes=FALSE, type="l")
axis(1,at=seq(df$Date[1], df$Date[length(df$Date)], by="years"),labels=unique(as.character(df$Date, format= "%Y")))
axis(2)
box()

times <- ts(df$Rainfall, start = df$Date[1], frequency=1)
times_week <- period.apply(as.xts(times), endpoints(as.xts(times), "weeks"), range)
plot( times_week,xlab = "Time", ylab = "Rainfall", main="Rainfall", axes=FALSE)
axis(1,at=seq(df$Date[1], df$Date[length(df$Date)], by="years"),labels=unique(as.character(df$Date, format= "%Y")))
axis(2)
box()

We can say that it is not obvious to resample data, we can make predictions for one day and it will be quite accurate and full. If we will take for day or month we can lose some info that will be critical for people’ life. Also there is no such big outliers that require smoothing

Analyse data

#depth <- diff(depth)

depth <- na_interpolation(df$Depth_to_Groundwater, option = "linear")
depth_ts <- ts(depth, start = df$Date[1], frequency=365)

look at seasonal, because we can make some decisions from the seasonalty.

ts_seasonal(ts(na_interpolation(df$Depth_to_Groundwater, option = "linear"), start = df$Date[1], frequency=12), type = "all")
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
ts_plot(depth_ts)
rainfall <- na_interpolation(df$Rainfall, option = "linear")
rainfall_ts <- ts(rainfall, start = df$Date[1], frequency=12)
rainfall_ts <- diff(rainfall_ts)
ts_seasonal(rainfall_ts, type = "all")

Rainfall ???

temperature <- na_interpolation(df$Temperature, option = "linear")
temperature_ts <- ts(temperature, start = df$Date[1], frequency=12)
temperature_ts <- diff(temperature_ts)
ts_seasonal(temperature_ts, type = "all")

From the plot we can see that the maximum temperature was in August, minimum in December

volume <- na_interpolation(df$` Volume`, option = "linear")
volume_ts <- ts(volume, start = df$Date[1], frequency=12)
volume_ts <- diff(volume_ts)
ts_seasonal(volume_ts, type = "all")

Maximum volume was in March, minimum in December

hydrometry <- na_interpolation(df$` Hydrometry`, option = "linear")
hydrometry_ts <- ts(hydrometry, start = df$Date[1], frequency=12)
hydrometry_ts <- diff(hydrometry_ts)
ts_seasonal(hydrometry_ts, type = "all")

Maximum in August, minimum in January.

The volume and hydrometry reached their minimum around the same time